In [4]:
import json
import os
# For parsing URLs:
from urllib.parse import quote_plus
from urllib.parse import urlsplit

import nltk
import pandas as pd
import requests
from langdetect import detect


In [5]:
def search_cc_index(url, index_name, timeout=60):
    """
    Search the Common Crawl Index for a given URL.

    Queries the Common Crawl Index API for captures of `url` in the index
    named by `index_name`. The API answers with one JSON document per line;
    each line is decoded and the decoded objects are returned as a list.

    Arguments:
        url (str): The URL to search for in the Common Crawl Index.
        index_name (str): The name of the Common Crawl Index to search (e.g., "CC-MAIN-2024-10").
        timeout (float): Seconds to wait for the index server before
            `requests` gives up and raises. Defaults to 60.

    Returns:
        list: A list of JSON objects representing records found in the Common Crawl Index.
              Returns None if the response status is not 200 or the body
              contains no records.

    Example:
        >>> search_cc_index("example.com", "CC-MAIN-2024-10")
        [{...}, {...}, ...]
    """
    encoded_url = quote_plus(url)
    index_url = f'http://index.commoncrawl.org/{index_name}-index?url={encoded_url}&output=json'
    # Without a timeout a stalled server would block the notebook indefinitely.
    response = requests.get(index_url, timeout=timeout)

    if response.status_code != 200:
        return None

    # Skip blank lines: an empty body used to reach json.loads('') and raise.
    records = [json.loads(line) for line in response.text.splitlines() if line.strip()]
    return records or None
 

In [6]:
from warcio.archiveiterator import ArchiveIterator
from bs4 import BeautifulSoup
import sys
import nltk
from langdetect import detect
import re
import pandas as pd

# Regular expression matching any run of characters outside the 7-bit ASCII
# range (\x00-\x7F). NOTE: despite the name this is a "non-ASCII" test, so it
# also matches accented Latin letters such as "é".
non_latin_pattern = re.compile(r'[^\x00-\x7F]+')

# Final host labels accepted by the extractors; they compare this set against
# get_last_domain_part(url), i.e. only the LAST dot-separated label.
# NOTE(review): "example.co.uk" passes via 'uk', while 'co' admits hosts that
# end in ".co" -- confirm that is the intent.
allowed_domains = {'com', 'gov', 'edu', 'co', 'uk', 'net', 'mil', 'ai', 'ca'}

def is_english(text):
    """
    Return True when `text` is pure ASCII and the language detector labels it 'en'.

    Any text containing a character matched by `non_latin_pattern` (i.e. a
    non-ASCII character) is rejected without calling the detector. Detector
    failures (for example on empty or digit-only input) and unusable input
    such as None are treated as "not English" and yield False.
    """
    try:
        # Cheap regex test first; it avoids the comparatively slow detect() call.
        if non_latin_pattern.search(text):
            return False
        return detect(text) == 'en'
    except Exception:
        # Best-effort classification: detection errors mean "not English".
        # `Exception` (not a bare except) lets KeyboardInterrupt/SystemExit
        # propagate so a long-running extraction loop can still be stopped.
        return False

def is_latin_not_english(text):
    """
    Return True when `text` is pure ASCII but the language detector does NOT label it 'en'.

    Text containing any character matched by `non_latin_pattern` (i.e. a
    non-ASCII character) is rejected without calling the detector. Detector
    failures and unusable input such as None yield False, so such text is
    classified neither as English nor as non-English.
    """
    try:
        # Cheap regex test first; it avoids the comparatively slow detect() call.
        if non_latin_pattern.search(text):
            return False
        return detect(text) != 'en'
    except Exception:
        # Best-effort classification: detection errors mean "no match".
        # `Exception` (not a bare except) lets KeyboardInterrupt/SystemExit
        # propagate so a long-running extraction loop can still be stopped.
        return False


def get_last_domain_part(url: str) -> str:
    """
    Return the last dot-separated label of the host in `url` (e.g. "com").

    The host is taken from the parsed URL, so credentials and port numbers
    are ignored, and it is lower-cased. URLs written without a scheme
    ("example.org/path") are accepted as well.

    Returns:
        str: The final host label, or "" when `url` is empty/None or no host
             can be extracted from it.
    """
    if not url:
        return ""
    try:
        host = urlsplit(url).hostname
        if host is None:
            # No "//" authority part: re-parse as a network-path reference so
            # "example.org/path" still yields a host.
            host = urlsplit("//" + url).hostname
    except ValueError:
        # urlsplit rejects malformed authorities (e.g. unbalanced IPv6 brackets).
        return ""
    if not host:
        return ""
    # Drop the trailing dot of a fully-qualified name before taking the last label.
    return host.rstrip(".").rsplit(".", 1)[-1]

def is_error_response(input: str):
    """
    Heuristically decide whether a page title looks like an error page.

    The comparison is case-insensitive. A title is flagged when one of its
    whitespace-separated words is exactly "404", or when it contains the
    phrase "no response" or "not found" anywhere.
    """
    lowered = input.lower()
    # Whole-word match only: "404" counts, "404:" or "4040" do not.
    if not {"404"}.isdisjoint(lowered.split()):
        return True
    return any(phrase in lowered for phrase in ("no response", "not found"))
    

def _extract_description(soup):
    """
    Return the page description from parsed HTML.

    Prefers the ``og:description`` meta tag and falls back to
    ``<meta name="description">``. A tag whose content is missing or blank is
    treated as absent, so the next candidate is tried. Returns the literal
    'No Description' when neither tag carries any text.
    """
    for attrs in ({'property': 'og:description'}, {'name': 'description'}):
        tag = soup.find('meta', attrs=attrs)
        if tag is None:
            continue
        content = (tag.get('content') or '').strip()
        if content:
            return content
    return 'No Description'


def _collect_page_metadata(warc_file, want_english_lang, title_filter, max_results=None):
    """
    Scan a WARC file and collect url/description/title dicts for matching HTML pages.

    A page is kept when all of the following hold:
      * the record is a 'response' whose Content-Type mentions 'text/html';
      * it has an <html> tag whose `lang` attribute starts with 'en'
        (case-insensitively) exactly when `want_english_lang` is True;
      * the last host label of its WARC-Target-URI is in `allowed_domains`;
      * it has a <title> that is not flagged by `is_error_response` and that
        satisfies `title_filter`.

    Prints the running match count every 20 matches as a progress indicator.

    Arguments:
        warc_file (str): Path of the WARC file to read.
        want_english_lang (bool): Required outcome of the `lang` test.
        title_filter (callable): Predicate applied to the stripped title text.
        max_results (int | None): Stop after this many matches; None means no limit.

    Returns:
        list: Dicts with the keys "url", "description" and "title".
    """
    results = []
    if max_results is not None and max_results <= 0:
        return results

    count = 0
    with open(warc_file, 'rb') as stream:
        for record in ArchiveIterator(stream):
            if record.rec_type != 'response':
                continue
            # Defensive: skip records for which no HTTP headers were parsed.
            http_headers = record.http_headers
            if http_headers is None or 'text/html' not in http_headers.get('Content-Type', ''):
                continue

            soup = BeautifulSoup(record.content_stream().read(), 'html.parser')

            html_tag = soup.find('html')
            if html_tag is None:
                continue
            # Language tags are case-insensitive ("EN-us" is English too).
            lang = (html_tag.get('lang') or '').strip().lower()
            if lang.startswith('en') != want_english_lang:
                continue

            # A record without a target URI cannot be domain-filtered; skip it
            # instead of crashing inside get_last_domain_part.
            url = record.rec_headers.get('WARC-Target-URI')
            if not url or get_last_domain_part(url) not in allowed_domains:
                continue

            title_tag = soup.find('title')
            title = title_tag.text.strip() if title_tag else None
            if title is None or is_error_response(title) or not title_filter(title):
                continue

            if count % 20 == 0:
                print(count)
            count += 1
            results.append({"url": url, "description": _extract_description(soup), "title": title})

            # Stop as soon as the cap is reached rather than scanning on for
            # one more match that would be discarded anyway.
            if max_results is not None and count >= max_results:
                break
    return results


def extract_english_files(warc_file, max_results=None):
    """
    Returns a list of dictionaries with url, description and title keys from
    English web pages in a WARC file.

    A page counts as English when its <html lang> starts with 'en' and its
    title passes `is_english`. Pages outside `allowed_domains`, without a
    title, or with an error-looking title are skipped.

    Arguments:
        warc_file (str): Path of the WARC file to read.
        max_results (int | None): Optional cap on the number of pages
            returned; the default None processes the whole file.
    """
    return _collect_page_metadata(warc_file, True, is_english, max_results)


def extract_non_english_latin(warc_file, max_results=800):
    """
    Returns a list of dictionaries with url, description and title keys from
    pages whose <html lang> does not start with 'en' (or is missing) and
    whose title passes `is_latin_not_english`, i.e. is pure ASCII but is not
    detected as English.

    Pages outside `allowed_domains`, without a title, or with a title flagged
    by `is_error_response` are skipped.

    Arguments:
        warc_file (str): Path of the WARC file to read.
        max_results (int | None): Cap on the number of pages returned
            (default 800); None processes the whole file.
    """
    return _collect_page_metadata(warc_file, False, is_latin_not_english, max_results)



In [7]:
!pwd

/Users/Rrando/Documents/GitHub/smart-tab-grouping/notebooks


In [8]:
# Directory holding the downloaded Common Crawl WARC segments. Set the
# COMMON_CRAWL_DIR environment variable to point elsewhere instead of editing
# this cell; the default is the location that was originally hard-coded here.
COMMON_CRAWL_DIR = os.environ.get("COMMON_CRAWL_DIR", "/Users/Rrando/Documents/common_crawl")
PREFIX_PATH = os.path.join(COMMON_CRAWL_DIR, "CC-MAIN-20241102010035-20241102040035-")
# Segment suffixes to process. NOTE(review): "00001" is not in the list --
# confirm whether that omission is intentional.
file_list = ["00000", "00002", "00003", "00004", "00005"]

# Extract the English pages of each segment and store them as one CSV per segment.
for fname in file_list:
    r = extract_english_files(PREFIX_PATH + fname + ".warc.gz")
    cc_corpus = pd.DataFrame(r)
    cc_corpus.to_csv(f"../data/external/common_crawl_{fname}.csv")

0
20


Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.
Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.


40


Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.


60
80



Assuming this really is an XML document, what you're doing might work, but you should know that using an XML parser will be more reliable. To parse this document as XML, make sure you have the Python package 'lxml' installed, and pass the keyword argument `features="xml"` into the BeautifulSoup constructor.




  soup = BeautifulSoup(payload, 'html.parser')


100


Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.
Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.


120


Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.


140


Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.


160
180
200


Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.


220
240
260
280
300
320
340
360
380
400
420
440
460
480
500
520
540
560
580
600
620
640
660
680
700
720
740
760
780
800
820
840
860
880
900
920
940
960
980
1000
1020
1040
1060
1080
1100
1120


Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.


1140


Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.


1160
1180
1200
1220
1240
1260
1280
1300
1320
1340
1360


Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.


1380
1400
1420
1440
1460
1480
1500
1520


Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.


1540
1560
1580
1600
1620
1640
1660
1680
1700
1720
1740
1760


Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.


1780
1800
1820


Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.


1840
1860
1880
1900
1920
1940
1960
1980


Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.


2000
2020
2040
2060
2080
2100
2120


Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.


2140
2160
2180
2200
2220
2240


Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.


2260
2280
2300
2320
2340
2360
2380
2400
2420
2440
2460
2480
2500
2520
2540
2560
2580
2600
2620
2640
2660
2680
2700
2720
2740
2760
2780
2800
2820
2840
2860
2880
2900
2920
2940
2960
2980
3000
3020
3040
3060
3080
3100
3120
3140
3160
3180
3200
3220
3240
3260
3280
3300
3320
3340
3360
3380
3400
3420
3440
3460
3480
3500
3520
3540
3560
3580
3600
3620
3640
3660
3680
3700
3720
3740
3760
3780
3800
3820
3840
3860
3880
3900
3920
3940
3960
3980
4000
4020
4040
4060
4080
4100
4120
4140
4160
4180
4200
4220
4240
4260
4280
4300
4320
4340
4360
4380
4400
4420
4440
4460
4480


Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.
Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.


4500
4520
4540
4560
4580
4600
4620
4640
4660
4680
4700
4720
4740
4760


Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.


4780
4800
4820
0
20


Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.
Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.


40
60


Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.


80
100
120
140


Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.
Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.


160
180
200
220
240
260
280
300
320
340
360
380


Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.


400
420
440
460
480
500
520
540
560
580
600
620
640
660
680
700
720
740
760
780
800
820
840
860
880
900
920
940


Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.


960
980
1000
1020


Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.


1040
1060
1080
1100
1120
1140
1160


Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.


1180
1200
1220


Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.


1240
1260
1280
1300
1320
1340
1360
1380
1400
1420
1440
1460
1480
1500
1520
1540
1560
1580
1600
1620
1640


Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.


1660
1680


Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.


1700
1720
1740
1760
1780
1800
1820
1840
1860
1880
1900
1920
1940
1960
1980
2000
2020
2040
2060
2080
2100
2120
2140
2160
2180
2200
2220
2240
2260
2280
2300
2320
2340
2360
2380
2400
2420
2440
2460
2480
2500
2520
2540
2560
2580
2600
2620


Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.


2640
2660
2680
2700
2720
2740
2760
2780
2800


Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.


2820


Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.


2840


Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.


2860
2880


Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.


2900
2920
2940
2960
2980
3000
3020
3040
3060
3080
3100
3120


Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.


3140
3160
3180
3200
3220
3240
3260
3280
3300
3320
3340
3360
3380
3400


Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.


3420
3440
3460
3480
3500
3520
3540
3560
3580
3600
3620
3640
3660
3680
3700
3720
3740
3760
3780
3800
3820
3840
3860
3880
3900
3920
3940
3960
3980
4000
4020
4040
4060
4080
4100
4120
4140
4160
4180
4200
4220
4240
4260
4280
4300
4320
4340
4360
4380
4400
4420
4440
4460
4480
4500
4520
4540
4560
4580
4600
4620
4640
4660
4680
4700
4720
4740
4760
4780
4800
0


Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.


20
40
60


Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.


80


Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.


100
120


Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.
Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.


140


Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.
Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.


160


Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.
Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.


180


Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.


200


Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.
Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.
Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.


220
240
260
280
300


Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.


320
340
360


Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.


380
400
420
440
460
480
500
520
540
560
580
600
620
640
660
680
700
720
740
760
780
800
820
840
860
880
900
920
940
960
980
1000


Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.


1020
1040
1060
1080
1100


Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.


1120
1140
1160
1180
1200
1220
1240
1260
1280
1300
1320
1340
1360
1380
1400
1420
1440
1460
1480
1500
1520
1540
1560
1580
1600
1620
1640
1660


Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.


1680
1700
1720
1740


Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.


1760
1780
1800
1820
1840
1860
1880
1900
1920
1940
1960
1980
2000
2020
2040
2060
2080
2100
2120
2140
2160
2180
2200
2220
2240


Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.


2260
2280
2300
2320
2340
2360
2380
2400
2420
2440
2460
2480
2500
2520
2540
2560
2580


Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.
Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.


2600
2620
2640
2660
2680
2700
2720
2740
2760
2780
2800
2820
2840
2860
2880
2900
2920
2940
2960
2980
3000
3020
3040
3060
3080
3100
3120
3140
3160
3180
3200


Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.


3220
3240
3260
3280


Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.


3300
3320
3340
3360
3380
3400
3420
3440
3460
3480
3500
3520
3540


Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.


3560
3580
3600
3620
3640
3660
3680
3700
3720
3740
3760
3780
3800
3820


Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.


3840
3860
3880
3900
3920
3940
3960
3980


Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.


4000
4020
4040
4060
4080
4100
4120
4140
4160
4180
4200
4220
4240
4260
4280
4300
4320
4340


Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.


4360
4380
4400
4420
4440
4460
4480


Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.


4500
4520
4540
4560
4580
4600
4620
4640
4660
4680
4700
4720
4740


Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.


4760
4780
4800
4820
4840
4860
4880
4900
0
20


Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.


40
60


Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.


80
100
120


Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.
Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.


140


Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.


160


Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.


180


Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.
Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.
Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.


200


Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.


220
240
260
280
300
320
340
360
380
400
420
440
460
480
500
520


Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.


540
560
580
600
620
640
660
680
700
720
740
760
780
800
820
840


Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.


860
880
900


Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.


920
940
960
980
1000
1020
1040
1060
1080
1100


Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.
Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.


1120
1140
1160
1180
1200
1220
1240
1260
1280
1300
1320
1340
1360
1380
1400
1420
1440
1460


Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.


1480
1500
1520
1540
1560
1580
1600
1620
1640
1660


Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.


1680
1700
1720
1740
1760
1780
1800
1820
1840
1860
1880
1900
1920
1940
1960
1980
2000
2020


Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.


2040
2060
2080
2100
2120
2140
2160
2180
2200


Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.


2220


Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.


2240
2260
2280
2300
2320
2340
2360
2380
2400
2420
2440
2460
2480
2500
2520
2540
2560
2580
2600
2620
2640
2660
2680


Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.


2700
2720
2740


Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.


2760
2780
2800
2820
2840
2860
2880
2900
2920
2940
2960
2980
3000
3020
3040
3060
3080
3100
3120
3140
3160
3180
3200
3220
3240
3260
3280
3300
3320
3340
3360
3380
3400
3420
3440
3460
3480
3500
3520
3540
3560
3580
3600
3620
3640
3660
3680
3700
3720
3740
3760
3780
3800
3820
3840
3860
3880
3900
3920
3940
3960
3980
4000
4020
4040
4060
4080
4100
4120
4140
4160
4180
4200
4220
4240
4260
4280
4300
4320
4340
4360
4380
4400
4420
4440
4460
4480
4500
4520
4540
4560
4580
4600
4620
4640
4660
4680
4700
4720
4740
4760
4780
4800
4820
4840


Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.


4860
4880
4900
4920
0


Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.


20
40
60


Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.


80


Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.


100
120


Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.


140


Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.


160


Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.


180
200


Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.


220
240


Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.


260
280
300
320
340


Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.


360
380
400
420
440
460
480
500
520
540
560
580
600
620
640
660
680
700
720
740
760
780
800
820
840
860
880
900
920
940
960
980
1000


Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.


1020
1040


Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.


1060
1080
1100
1120
1140
1160
1180
1200
1220
1240
1260
1280
1300
1320
1340
1360


Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.


1380
1400
1420
1440
1460
1480
1500
1520
1540
1560
1580
1600
1620
1640
1660
1680


Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.
Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.


1700
1720
1740
1760
1780
1800
1820
1840
1860
1880
1900
1920
1940
1960
1980
2000
2020
2040
2060
2080
2100
2120
2140
2160
2180
2200


Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.


2220
2240
2260
2280
2300
2320
2340
2360


Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.
Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.


2380
2400
2420
2440
2460
2480
2500
2520
2540
2560
2580
2600
2620
2640
2660
2680
2700
2720
2740
2760
2780
2800
2820
2840
2860
2880
2900
2920
2940
2960
2980
3000
3020
3040
3060
3080
3100
3120
3140
3160
3180
3200
3220
3240
3260
3280
3300
3320
3340
3360
3380
3400
3420
3440


Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.


3460
3480
3500
3520
3540
3560
3580
3600
3620
3640


Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.
Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.


3660
3680
3700
3720


Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.


3740
3760
3780
3800
3820
3840
3860
3880
3900
3920
3940
3960


Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.


3980
4000
4020
4040
4060
4080
4100
4120
4140
4160
4180


Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.


4200
4220
4240
4260
4280
4300
4320
4340
4360
4380
4400
4420
4440
4460
4480
4500


Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.
Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.


4520
4540
4560
4580
4600
4620
4640
4660
4680
4700
4720
4740
4760
4780
4800
4820
4840
4860


In [None]:
# Extract the non-English, ASCII-title pages from a single WARC segment and
# save them as a CSV next to the English corpora.
# NOTE(review): the input is an absolute, machine-specific path -- adjust it
# before running on another machine.
# `r` and `cc_corpus` reuse (and overwrite) the names set by the per-segment loop.
r = extract_non_english_latin('/Users/Rrando/crawl/out/CC-MAIN-20250218081924-20250218111924-00893.warc.gz')
cc_corpus = pd.DataFrame(r)
cc_corpus.to_csv("../data/external/common_crawl_non_english.csv")